library(tidyverse)
library(tidymodels)
library(workflows)
library(magrittr)
source('data_retrieval.R')
source('preprocess.R')
source('modeling.R')
source('data_retrieval.R')
source('analysis.R')

# Load the revision cancer events and keep only the rows whose
# EVE_event_number equals 0.
# NOTE(review): `con` is not created in this script; presumably one of the
# sourced files defines it -- confirm.
revision_cancer_events <- filter(
  get_revision_cancer_events_data(con),
  EVE_event_number == 0
)

# Seed the RNG before the pipeline below so that any random step it
# contains is repeatable.
set.seed(42)

# Feature engineering -> cleaning -> split. The result exposes `$train` and
# `$test`, which the rest of the script reads.
# NOTE(review): 0.5 is presumably the split proportion -- confirm in
# create_hashed_split().
# The outer parentheses make the assigned value auto-print at top level.
(
  revision_cancer_events_split <- revision_cancer_events %>%
    add_UTL_features() %>%
    eol_cleaning_pipeline() %>%
    create_hashed_split(0.5)
)



# Two quantities derived from the training partition for the outcome
# DMG_died_within_365d: the calibration proportion (passed to
# calibrate_preds() later on) and a downsampled copy of the training data
# (used for the hyperparameter search and the model fit).
calibration_proportion <- get_calibration_proportion(
  revision_cancer_events_split$train,
  DMG_died_within_365d
)
downsampled_train <- downsample_data(
  revision_cancer_events_split$train,
  DMG_died_within_365d
)

# Initial hyperparameter values handed to run_bayes_optimisation().
# The outer parentheses make the list auto-print at top level.
(
  initial_model_params <- list(
    tree_depth = 8,
    learn_rate = 0.123001965989824,
    loss_reduction = 11.1582974204794,
    min_n = 2.44744561146945,
    sample_size = 0.773196286475286,
    mtry = 0.549052911438048,
    trees = 400
  )
)

# Hyperparameter search ----
# Time the Bayesian search, then keep the best parameter set with the
# `.config` bookkeeping column removed so it can be spliced into the model
# spec below.
# NOTE(review): 50 is presumably the number of search iterations -- confirm
# in run_bayes_optimisation().
tic <- Sys.time()
best_params <- run_bayes_optimisation(
  downsampled_train,
  DMG_died_within_365d,
  initial_model_params,
  50
) %>%
  select_best() %>%
  select(-.config)

print(paste0(
  "Hyperparameter search took ",
  difftime(Sys.time(), tic, units = "hours"),
  " hours!"
))

# Train model ----
# Build the xgboost spec with the selected parameters; `!!!` splices each
# column of `best_params` in as a named argument.
best_model_spec <- get_xgboost_spec() %>%
  set_args(nthread = 40, !!!best_params)

# `toc` holds the START time of the training step (not an end time).
toc <- Sys.time()
# The outer parentheses make the fitted object auto-print at top level.
(
  xgboost_fitted <- downsampled_train %>%
    train_model(best_model_spec, outcome = DMG_died_within_365d)
)

print(paste0(
  "Model training took ",
  difftime(Sys.time(), toc, units = "hours"),
  " hours!"
))

# Test-set evaluation ----
# Predict on the held-out test partition, then calibrate the `.pred_1`
# column using the proportion computed from the training partition.
preds_test <- get_model_preds(
  model = xgboost_fitted,
  data = revision_cancer_events_split$test,
  true_outcome = "DMG_died_within_365d"
)

preds_test_calibrate <- calibrate_preds(
  preds_test,
  calibration_proportion,
  .pred_1
)

# From here on `roc_auc` names both the metric function and this result.
# R skips non-function bindings when resolving a call, so later
# `roc_auc(...)` calls still reach the function.
roc_auc <- roc_auc(preds_test_calibrate, truth = true_value, .pred_1)

feature_importance <- get_feature_importance(xgboost_fitted)
# NOTE(review): pull_workflow_fit() is deprecated in recent `workflows`
# releases in favour of extract_fit_parsnip(); kept as-is for compatibility
# with the installed version.
fit_xgboost <- pull_workflow_fit(xgboost_fitted)

# Bundle the fitted model, calibrated test predictions, test AUC and
# feature importance, and persist the bundle to disk.
cancer_revision_p0 <- list(
  fit_xgboost = fit_xgboost,
  preds_test = preds_test_calibrate,
  roc_auc = roc_auc,
  feature_importance = feature_importance
)
save(cancer_revision_p0, file = "revision_p0.RData")

#########################################################################################
# AUC on the training set. Reuses `downsampled_train` and
# `calibration_proportion` created earlier in this script, so the AUC is
# computed on the downsampled training data.

# Read back the bundle stored in "revision_p0.RData".
initial_predict <- get_load_new(file_name = "revision_p0.RData")

# Prepare the recipe on the downsampled training data and extract the
# processed training set from it.
recipe <- make_matrix_recipe(downsampled_train, DMG_died_within_365d) %>%
  prep()
train_data_preprocessed <- recipe %>%
  juice()

# Class probabilities for the training rows; the outcome column is dropped
# from the predictors first.
preds <- predict(
  initial_predict$fit_xgboost,
  train_data_preprocessed %>% select(-DMG_died_within_365d),
  type = "prob"
)

preds_train_calibrate <- calibrate_preds(preds, calibration_proportion, .pred_1)

# NOTE(review): the estimate column here is `preds_after_bayes`, whereas the
# test-set AUC uses `.pred_1`. Nothing in this script creates
# `preds_after_bayes`; presumably calibrate_preds() adds it -- confirm that
# the train and test AUCs are computed on the intended columns.
roc_auc_train <- roc_auc(
  preds_train_calibrate,
  truth = train_data_preprocessed$DMG_died_within_365d,
  preds_after_bayes
)

# Stack the test and train AUC rows, tag them with the sample label and
# write them out. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
rbind(
  data.frame(initial_predict$roc_auc) %>% mutate(split = "test"),
  data.frame(roc_auc_train) %>% mutate(split = "train")
) %>%
  mutate(sample = "p0") %>%
  write.csv("p0_canc_AUC.csv", row.names = TRUE)


